load packages

library(ggplot2)
library(data.table)
library(stringr)
library(dplyr)
library(tibble)
library(tidyr)
library(rprojroot)
library(Rtsne)
library(cowplot)

set paths and filenames

### files
long_table=sprintf("%s/data/mp142_TGVG1.1_MPA4_combined_abundance_table_longform1.tsv", find_rstudio_root_file())
metadata_table=sprintf("%s/data/some_teddy_MP142_metadata2.all_samples1.delivery.csv", find_rstudio_root_file())

load long data table, filter by SGBs detected in over 100 samples

long_dt <- fread(sprintf("%s", long_table), sep = "\t", header = T) %>%
  select(sampleID, rel_abundance, lineage) %>%
  mutate(kingdom = case_when(grepl("k__Bac", lineage) ~ "Bacteria", 
                             grepl("k__Vir", lineage) ~ "Virus",
                             grepl("k__Ar", lineage) ~ "Archea",
                             grepl("k__Euk", lineage) ~ "Eukaryota",
                             TRUE ~ "other")) %>%
  group_by(lineage) %>%
  filter(n() >= 100) %>%
  ungroup()

wide_bac_sp_dt <- long_dt %>%
  filter(kingdom == "Bacteria") %>%
  select(-kingdom) %>%
  pivot_wider(names_from = lineage, values_from = rel_abundance, values_fill = 0)

sampleID_l <- wide_bac_sp_dt$sampleID


wide_bac_sp_dt <- wide_bac_sp_dt %>% select(-sampleID)


wide_vir_sp_dt <- long_dt %>%
  filter(kingdom == "Virus") %>%
  select(-kingdom) %>%
  pivot_wider(names_from = lineage, values_from = rel_abundance, values_fill = 0)

sampleID_x <- wide_vir_sp_dt$sampleID


wide_vir_sp_dt <- wide_vir_sp_dt %>% select(-sampleID)

load metadata table

meta_dt <- fread(sprintf("%s", metadata_table), sep = ",", header = T) %>%
  select(-V1) %>%
  mutate(sample = as.character(sample))

run bacteria PCA

bac_prcomp1 <- prcomp(wide_bac_sp_dt)
names(bac_prcomp1)
[1] "sdev"     "rotation" "center"   "scale"    "x"       
#plot(bac_prcomp1$x[,1:2], pch = ".")

emb_bac <- Rtsne::Rtsne(bac_prcomp1$x[,1:20], perplexity = 20)

embb_bac <- emb_bac$Y

rownames(embb_bac) <- sampleID_l

embb_bac_dt <- as.data.frame(embb_bac) %>%
  setDT(., keep.rownames = "sampleID")

embb_bac_meta_dt <- merge(embb_bac_dt, meta_dt, by.x = "sampleID", by.y = "sample") %>%
  mutate(Country = case_when(
         country == 1 ~ "USA",
         country == 2 ~ "FIN",
         country == 3 ~ "GER",
         country == 4 ~ "SWE",
         TRUE ~ "other"
       ))

bac_dayp <- embb_bac_meta_dt %>%
  ggplot(aes(x=V1, y=V2, color=age_days, shape = Country)) +
  geom_point(size = 1.3, alpha = 0.5) +
  scale_color_gradient2(low = "maroon", mid = "grey", high = "#3F3F7B", 
                        midpoint = 500, limits = c(0,1400), na.value = "black", name = "Day of Life") +
  scale_shape_manual(values=c(15, 16, 17, 18)) +
  theme_bw() +
  labs(x = "tSNE-1", y = "tSNE-2", title = "tSNE of Bacteria in TEDDY samples")
bac_dayp


bac_T1Dp <- embb_bac_meta_dt %>%
  ggplot(aes(x=V1, y=V2, color=T1D, shape = Country)) +
  geom_point(size = 1.3, alpha = 0.5) +
  scale_color_manual(values=c("#8CBEB1", "red")) +
  scale_shape_manual(values=c(15, 16, 17, 18)) +
  theme_bw() +
  labs(x = "tSNE-1", y = "tSNE-2", title = "tSNE of Bacteria in TEDDY samples")
bac_T1Dp

run virus PCA

vir_prcomp1 <- prcomp(wide_vir_sp_dt)

#plot(vir_prcomp1$x[,1:2], pch = ".")

emb_vir <- Rtsne::Rtsne(vir_prcomp1$x[,1:20], perplexity = 20)

emb_vir <- emb_vir$Y

rownames(emb_vir) <- sampleID_x

embb_vir_dt <- as.data.frame(emb_vir) %>%
  setDT(., keep.rownames = "sampleID")

embb_vir_meta_dt <- merge(embb_vir_dt, meta_dt, by.x = "sampleID", by.y = "sample") %>%
  mutate(Country = case_when(
         country == 1 ~ "USA",
         country == 2 ~ "FIN",
         country == 3 ~ "GER",
         country == 4 ~ "SWE",
         TRUE ~ "other"
       ))

vir_dayp <- embb_vir_meta_dt %>%
  ggplot(aes(x=V1, y=V2, color=age_days, shape = Country)) +
  geom_point(size = 1.3, alpha = 0.5) +
  scale_color_gradient2(low = "maroon", mid = "grey", high = "#3F3F7B", 
                        midpoint = 500, limits = c(0,1400), na.value = "black", name = "Day of Life") +
  scale_shape_manual(values=c(15, 16, 17, 18)) +
  theme_bw() +
  labs(x = "tSNE-1", y = "tSNE-2", title = "tSNE of Viruses in TEDDY samples")
vir_dayp


vir_T1Dp <- embb_vir_meta_dt %>%
  ggplot(aes(x=V1, y=V2, color=T1D, shape = Country)) +
  geom_point(size = 1.3, alpha = 0.5) +
  scale_color_manual(values=c("#8CBEB1", "red")) +
  scale_shape_manual(values=c(15, 16, 17, 18)) +
  theme_bw() +
  labs(x = "tSNE-1", y = "tSNE-2", title = "tSNE of Viruses in TEDDY samples")
vir_T1Dp

NA
NA

combine and save

tSNE_combp <- plot_grid(bac_dayp, bac_T1Dp, vir_dayp, vir_T1Dp, align = "h", nrow = 2)

ggsave(tSNE_combp, file = sprintf("%s/charts/tSNE_bacteria_vs_virus_all_samples.pdf", find_rstudio_root_file()), width = 8, height = 8)
LS0tCnRpdGxlOiAiTWFrZSB0U05FIGZvciBiYWN0ZXJpYSBhbmQgdmlydXMgU0dCcywgY29sb3IgYnkgbWV0YWRhdGEiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmxvYWQgcGFja2FnZXMKYGBge3J9CmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShkYXRhLnRhYmxlKQpsaWJyYXJ5KHN0cmluZ3IpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkodGliYmxlKQpsaWJyYXJ5KHRpZHlyKQpsaWJyYXJ5KHJwcm9qcm9vdCkKbGlicmFyeShSdHNuZSkKbGlicmFyeShjb3dwbG90KQoKYGBgCgpzZXQgcGF0aHMgYW5kIGZpbGVuYW1lcwpgYGB7cn0KIyMjIGZpbGVzCmxvbmdfdGFibGU9c3ByaW50ZigiJXMvZGF0YS9tcDE0Ml9UR1ZHMS4xX01QQTRfY29tYmluZWRfYWJ1bmRhbmNlX3RhYmxlX2xvbmdmb3JtMS50c3YiLCBmaW5kX3JzdHVkaW9fcm9vdF9maWxlKCkpCm1ldGFkYXRhX3RhYmxlPXNwcmludGYoIiVzL2RhdGEvc29tZV90ZWRkeV9NUDE0Ml9tZXRhZGF0YTIuYWxsX3NhbXBsZXMxLmRlbGl2ZXJ5LmNzdiIsIGZpbmRfcnN0dWRpb19yb290X2ZpbGUoKSkKCmBgYAoKbG9hZCBsb25nIGRhdGEgdGFibGUsIGZpbHRlciBieSBTR0JzIGRldGVjdGVkIGluIG92ZXIgMTAwIHNhbXBsZXMKYGBge3J9CmxvbmdfZHQgPC0gZnJlYWQoc3ByaW50ZigiJXMiLCBsb25nX3RhYmxlKSwgc2VwID0gIlx0IiwgaGVhZGVyID0gVCkgJT4lCiAgc2VsZWN0KHNhbXBsZUlELCByZWxfYWJ1bmRhbmNlLCBsaW5lYWdlKSAlPiUKICBtdXRhdGUoa2luZ2RvbSA9IGNhc2Vfd2hlbihncmVwbCgia19fQmFjIiwgbGluZWFnZSkgfiAiQmFjdGVyaWEiLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBncmVwbCgia19fVmlyIiwgbGluZWFnZSkgfiAiVmlydXMiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIGdyZXBsKCJrX19BciIsIGxpbmVhZ2UpIH4gIkFyY2hlYSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZ3JlcGwoImtfX0V1ayIsIGxpbmVhZ2UpIH4gIkV1a2FyeW90YSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgVFJVRSB+ICJvdGhlciIpKSAlPiUKICBncm91cF9ieShsaW5lYWdlKSAlPiUKICBmaWx0ZXIobigpID49IDEwMCkgJT4lCiAgdW5ncm91cCgpCgp3aWRlX2JhY19zcF9kdCA8LSBsb25nX2R0ICU+JQogIGZpbHRlcihraW5nZG9tID09ICJCYWN0ZXJpYSIpICU+JQogIHNlbGVjdCgta2luZ2RvbSkgJT4lCiAgcGl2b3Rfd2lkZXIobmFtZXNfZnJvbSA9IGxpbmVhZ2UsIHZhbHVlc19mcm9tID0gcmVsX2FidW5kYW5jZSwgdmFsdWVzX2ZpbGwgPSAwKQoKc2FtcGxlSURfbCA8LSB3aWRlX2JhY19zcF9kdCRzYW1wbGVJRAoKCndpZGVfYmFjX3NwX2R0IDwtIHdpZGVfYmFjX3NwX2R0ICU+JSBzZWxlY3QoLXNhbXBsZUlEKQoKCndpZGVfdmlyX3NwX2R0IDwtIGxvbmdfZHQgJT4lCiAgZmlsdGVyKGtpbmdkb20gPT0gIlZpcnVzIikgJT4lCiAgc2VsZWN0KC1raW5nZG9tKSAlPiUKICBwaXZvdF93aWRlcihuYW1lc19mcm9tID0gbGluZWFnZSwgdmFsdWVzX2Zyb20gPSByZWxfYWJ1bmRhbmNlLCB2YWx1ZXNfZmlsbCA9IDApCgpzYW1wbGVJRF94IDwtIHdpZGVfdmlyX3NwX2R0JHNhbXBsZUlECgoKd2lkZV92aXJfc3BfZHQgPC0gd2lkZV92aXJfc3BfZHQgJT4lIHNlbGVjdCgtc2FtcGxlSUQpCmBgYAoKbG9hZCBtZXRhZGF0YSB0YWJsZQpgYGB7cn0KbWV0YV9kdCA8LSBmcmVhZChzcHJpbnRmKCIlcyIsIG1ldGFkYXRhX3RhYmxlKSwgc2VwID0gIiwiLCBoZWFkZXIgPSBUKSAlPiUKICBzZWxlY3QoLVYxKSAlPiUKICBtdXRhdGUoc2FtcGxlID0gYXMuY2hhcmFjdGVyKHNhbXBsZSkpCmBgYAoKcnVuIGJhY3RlcmlhIFBDQQpgYGB7cn0KYmFjX3ByY29tcDEgPC0gcHJjb21wKHdpZGVfYmFjX3NwX2R0KQpuYW1lcyhiYWNfcHJjb21wMSkKI3Bsb3QoYmFjX3ByY29tcDEkeFssMToyXSwgcGNoID0gIi4iKQoKZW1iX2JhYyA8LSBSdHNuZTo6UnRzbmUoYmFjX3ByY29tcDEkeFssMToyMF0sIHBlcnBsZXhpdHkgPSAyMCkKCmVtYmJfYmFjIDwtIGVtYl9iYWMkWQoKcm93bmFtZXMoZW1iYl9iYWMpIDwtIHNhbXBsZUlEX2wKCmVtYmJfYmFjX2R0IDwtIGFzLmRhdGEuZnJhbWUoZW1iYl9iYWMpICU+JQogIHNldERUKC4sIGtlZXAucm93bmFtZXMgPSAic2FtcGxlSUQiKQoKZW1iYl9iYWNfbWV0YV9kdCA8LSBtZXJnZShlbWJiX2JhY19kdCwgbWV0YV9kdCwgYnkueCA9ICJzYW1wbGVJRCIsIGJ5LnkgPSAic2FtcGxlIikgJT4lCiAgbXV0YXRlKENvdW50cnkgPSBjYXNlX3doZW4oCiAgICAgICAgIGNvdW50cnkgPT0gMSB+ICJVU0EiLAogICAgICAgICBjb3VudHJ5ID09IDIgfiAiRklOIiwKICAgICAgICAgY291bnRyeSA9PSAzIH4gIkdFUiIsCiAgICAgICAgIGNvdW50cnkgPT0gNCB+ICJTV0UiLAogICAgICAgICBUUlVFIH4gIm90aGVyIgogICAgICAgKSkKCmJhY19kYXlwIDwtIGVtYmJfYmFjX21ldGFfZHQgJT4lCiAgZ2dwbG90KGFlcyh4PVYxLCB5PVYyLCBjb2xvcj1hZ2VfZGF5cywgc2hhcGUgPSBDb3VudHJ5KSkgKwogIGdlb21fcG9pbnQoc2l6ZSA9IDEuMywgYWxwaGEgPSAwLjUpICsKICBzY2FsZV9jb2xvcl9ncmFkaWVudDIobG93ID0gIm1hcm9vbiIsIG1pZCA9ICJncmV5IiwgaGlnaCA9ICIjM0YzRjdCIiwgCiAgICAgICAgICAgICAgICAgICAgICAgIG1pZHBvaW50ID0gNTAwLCBsaW1pdHMgPSBjKDAsMTQwMCksIG5hLnZhbHVlID0gImJsYWNrIiwgbmFtZSA9ICJEYXkgb2YgTGlmZSIpICsKICBzY2FsZV9zaGFwZV9tYW51YWwodmFsdWVzPWMoMTUsIDE2LCAxNywgMTgpKSArCiAgdGhlbWVfYncoKSArCiAgbGFicyh4ID0gInRTTkUtMSIsIHkgPSAidFNORS0yIiwgdGl0bGUgPSAidFNORSBvZiBCYWN0ZXJpYSBpbiBURUREWSBzYW1wbGVzIikKYmFjX2RheXAKCmJhY19UMURwIDwtIGVtYmJfYmFjX21ldGFfZHQgJT4lCiAgZ2dwbG90KGFlcyh4PVYxLCB5PVYyLCBjb2xvcj1UMUQsIHNoYXBlID0gQ291bnRyeSkpICsKICBnZW9tX3BvaW50KHNpemUgPSAxLjMsIGFscGhhID0gMC41KSArCiAgc2NhbGVfY29sb3JfbWFudWFsKHZhbHVlcz1jKCIjOENCRUIxIiwgInJlZCIpKSArCiAgc2NhbGVfc2hhcGVfbWFudWFsKHZhbHVlcz1jKDE1LCAxNiwgMTcsIDE4KSkgKwogIHRoZW1lX2J3KCkgKwogIGxhYnMoeCA9ICJ0U05FLTEiLCB5ID0gInRTTkUtMiIsIHRpdGxlID0gInRTTkUgb2YgQmFjdGVyaWEgaW4gVEVERFkgc2FtcGxlcyIpCmJhY19UMURwCgpgYGAKCgpydW4gdmlydXMgUENBCmBgYHtyfQp2aXJfcHJjb21wMSA8LSBwcmNvbXAod2lkZV92aXJfc3BfZHQpCgojcGxvdCh2aXJfcHJjb21wMSR4WywxOjJdLCBwY2ggPSAiLiIpCgplbWJfdmlyIDwtIFJ0c25lOjpSdHNuZSh2aXJfcHJjb21wMSR4WywxOjIwXSwgcGVycGxleGl0eSA9IDIwKQoKZW1iX3ZpciA8LSBlbWJfdmlyJFkKCnJvd25hbWVzKGVtYl92aXIpIDwtIHNhbXBsZUlEX3gKCmVtYmJfdmlyX2R0IDwtIGFzLmRhdGEuZnJhbWUoZW1iX3ZpcikgJT4lCiAgc2V0RFQoLiwga2VlcC5yb3duYW1lcyA9ICJzYW1wbGVJRCIpCgplbWJiX3Zpcl9tZXRhX2R0IDwtIG1lcmdlKGVtYmJfdmlyX2R0LCBtZXRhX2R0LCBieS54ID0gInNhbXBsZUlEIiwgYnkueSA9ICJzYW1wbGUiKSAlPiUKICBtdXRhdGUoQ291bnRyeSA9IGNhc2Vfd2hlbigKICAgICAgICAgY291bnRyeSA9PSAxIH4gIlVTQSIsCiAgICAgICAgIGNvdW50cnkgPT0gMiB+ICJGSU4iLAogICAgICAgICBjb3VudHJ5ID09IDMgfiAiR0VSIiwKICAgICAgICAgY291bnRyeSA9PSA0IH4gIlNXRSIsCiAgICAgICAgIFRSVUUgfiAib3RoZXIiCiAgICAgICApKQoKdmlyX2RheXAgPC0gZW1iYl92aXJfbWV0YV9kdCAlPiUKICBnZ3Bsb3QoYWVzKHg9VjEsIHk9VjIsIGNvbG9yPWFnZV9kYXlzLCBzaGFwZSA9IENvdW50cnkpKSArCiAgZ2VvbV9wb2ludChzaXplID0gMS4zLCBhbHBoYSA9IDAuNSkgKwogIHNjYWxlX2NvbG9yX2dyYWRpZW50Mihsb3cgPSAibWFyb29uIiwgbWlkID0gImdyZXkiLCBoaWdoID0gIiMzRjNGN0IiLCAKICAgICAgICAgICAgICAgICAgICAgICAgbWlkcG9pbnQgPSA1MDAsIGxpbWl0cyA9IGMoMCwxNDAwKSwgbmEudmFsdWUgPSAiYmxhY2siLCBuYW1lID0gIkRheSBvZiBMaWZlIikgKwogIHNjYWxlX3NoYXBlX21hbnVhbCh2YWx1ZXM9YygxNSwgMTYsIDE3LCAxOCkpICsKICB0aGVtZV9idygpICsKICBsYWJzKHggPSAidFNORS0xIiwgeSA9ICJ0U05FLTIiLCB0aXRsZSA9ICJ0U05FIG9mIFZpcnVzZXMgaW4gVEVERFkgc2FtcGxlcyIpCnZpcl9kYXlwCgp2aXJfVDFEcCA8LSBlbWJiX3Zpcl9tZXRhX2R0ICU+JQogIGdncGxvdChhZXMoeD1WMSwgeT1WMiwgY29sb3I9VDFELCBzaGFwZSA9IENvdW50cnkpKSArCiAgZ2VvbV9wb2ludChzaXplID0gMS4zLCBhbHBoYSA9IDAuNSkgKwogIHNjYWxlX2NvbG9yX21hbnVhbCh2YWx1ZXM9YygiIzhDQkVCMSIsICJyZWQiKSkgKwogIHNjYWxlX3NoYXBlX21hbnVhbCh2YWx1ZXM9YygxNSwgMTYsIDE3LCAxOCkpICsKICB0aGVtZV9idygpICsKICBsYWJzKHggPSAidFNORS0xIiwgeSA9ICJ0U05FLTIiLCB0aXRsZSA9ICJ0U05FIG9mIFZpcnVzZXMgaW4gVEVERFkgc2FtcGxlcyIpCnZpcl9UMURwCgoKYGBgCgpjb21iaW5lIGFuZCBzYXZlCmBgYHtyfQp0U05FX2NvbWJwIDwtIHBsb3RfZ3JpZChiYWNfZGF5cCwgYmFjX1QxRHAsIHZpcl9kYXlwLCB2aXJfVDFEcCwgYWxpZ24gPSAiaCIsIG5yb3cgPSAyKQoKZ2dzYXZlKHRTTkVfY29tYnAsIGZpbGUgPSBzcHJpbnRmKCIlcy9jaGFydHMvdFNORV9iYWN0ZXJpYV92c192aXJ1c19hbGxfc2FtcGxlcy5wZGYiLCBmaW5kX3JzdHVkaW9fcm9vdF9maWxlKCkpLCB3aWR0aCA9IDgsIGhlaWdodCA9IDgpCgpgYGAKCgoKCgoKCg==